8 Enrichment ggtree

Tip

This image showcases an integrated scientific visualization, centralizing a circular phylogenetic tree with annotated nodes, surrounded by associated heatmaps and bar charts that represent gene expression data and functional pathway enrichments.

library(TransProR)
library(ggtree)
library(ggtreeExtra)
library(org.Hs.eg.db)
library(clusterProfiler)
library(ReactomePA)
library(dplyr)
library(ggplot2)
library(ggnewscale)

8.1 Load data

tumor <- readRDS("../test_TransProR/generated_data1/removebatch_SKCM_Skin_TCGA_exp_tumor.rds")
normal <- readRDS('../test_TransProR/generated_data1/removebatch_SKCM_Skin_Normal_TCGA_GTEX_count.rds')
# Merge the datasets, ensuring both have genes as row names
all_count_exp <- merge(tumor, normal, by = "row.names")
all_count_exp <- tibble::column_to_rownames(all_count_exp, var = "Row.names")  # Set the row names

# Drawing data
all_count_exp <- log_transform(all_count_exp)

[1] "log2 transform finished"

DEG_deseq2 <- readRDS('../test_TransProR/Select DEGs/DEG_deseq2.Rdata')
#head(all_count_exp, 1)
head(DEG_deseq2, 5)

          baseMean log2FoldChange      lfcSE      stat pvalue padj change
A1BG      134.6084      -2.549682 0.05677219 -44.91075      0    0   down
A2M     30208.9912       3.251168 0.08394645  38.72907      0    0     up
AADACL2   801.4538      -8.229956 0.18969649 -43.38486      0    0   down
AARS2    1153.5493       1.624753 0.03283522  49.48202      0    0 stable
AARSD1    567.8672      -2.082289 0.02275703 -91.50088      0    0 stable

8.2 Convert from SYMBOL to ENTREZID

# Convert from SYMBOL to ENTREZID for convenient enrichment analysis later. It's crucial to do this now as a direct conversion may result in a reduced set of genes due to non-one-to-one correspondence.

# DEG_deseq2
# Retrieve gene list
gene <- rownames(DEG_deseq2)
# Perform conversion
gene = bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")

'select()' returned 1:many mapping between keys and columns

Warning in bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb =
"org.Hs.eg.db"): 43.37% of input gene IDs are fail to map...

# Remove duplicates and merge
gene <- dplyr::distinct(gene, SYMBOL, .keep_all=TRUE)
# Extract the SYMBOL column as a vector from the first dataset
symbols_vector <- gene$SYMBOL
# Use the SYMBOL column to filter corresponding rows from the second dataset by row names
DEG_deseq2 <- DEG_deseq2[rownames(DEG_deseq2) %in% symbols_vector, ]
head(DEG_deseq2, 5)

          baseMean log2FoldChange      lfcSE      stat pvalue padj change
A1BG      134.6084      -2.549682 0.05677219 -44.91075      0    0   down
A2M     30208.9912       3.251168 0.08394645  38.72907      0    0     up
AADACL2   801.4538      -8.229956 0.18969649 -43.38486      0    0   down
AARS2    1153.5493       1.624753 0.03283522  49.48202      0    0 stable
AARSD1    567.8672      -2.082289 0.02275703 -91.50088      0    0 stable

8.3 Select differentially expressed genes

Diff_deseq2 <- filter_diff_genes(
  DEG_deseq2, 
  p_val_col = "pvalue", 
  log_fc_col = "log2FoldChange",
  p_val_threshold = 0.01, 
  log_fc_threshold = 9.1
  )
# First, obtain a list of gene names from the row names of the first dataset
gene_names <- rownames(Diff_deseq2)
# Find the matching rows in the second dataframe
matched_rows <- all_count_exp[gene_names, ]
# Calculate the mean for each row
averages <- rowMeans(matched_rows, na.rm = TRUE)
# Append the averages as a new column to the first dataframe
Diff_deseq2$average <- averages
Diff_deseq2$ID <- rownames(Diff_deseq2)
Diff_deseq2$changetype <- ifelse(Diff_deseq2$change == 'up', 1, -1)
# Define a small threshold value
small_value <- .Machine$double.xmin
# Before calculating -log10, replace zeroes with the small threshold value and assign it to a new column
Diff_deseq2$log_pvalue <- ifelse(Diff_deseq2$pvalue == 0, -log10(small_value), -log10(Diff_deseq2$pvalue))
# Extract the expression data corresponding to the differentially expressed genes
heatdata_deseq2 <- all_count_exp[rownames(Diff_deseq2), ]
#head(heatdata_deseq2, 1)

8.4 Process heatdata for ggtree plotting

set.seed(123)
HeatdataDeseq2 <- process_heatdata(
  heatdata_deseq2, 
  selection = 2, 
  custom_names = NULL, 
  num_names_per_group = 4, 
  prefix_length = 4
  )
head(HeatdataDeseq2, 5)

                  TCGA_1   TCGA_2   TCGA_3   TCGA_4    GTEX_1    GTEX_2
ADIRF            0.00000  0.00000 0.000000 0.000000 10.426265  7.954196
ATP6V1G2-DDX39B  0.00000  0.00000 0.000000 2.000000  8.044394 10.769011
BANCR           10.82655 10.68299 8.997179 5.491853  0.000000  0.000000
BOLA2            0.00000  0.00000 0.000000 0.000000  6.700440  7.693487
C1QTNF5          0.00000  0.00000 0.000000 0.000000  8.134426  8.658211
                   GTEX_3    GTEX_4
ADIRF           10.194757 10.817783
ATP6V1G2-DDX39B  5.554589  8.335390
BANCR            0.000000  1.000000
BOLA2            6.554589  7.066089
C1QTNF5          7.266787  8.071462

8.5 Check nodes requiring staining

HclustDeseq2 <- hclust(dist(HeatdataDeseq2, method = "euclidean"), method = "complete")
p1 = ggtree(HclustDeseq2, branch.length = 'none', layout = "circular", size = 0.2, xlim = c(30, NA))
p1

# Examine node points, note the x-coordinate in this df (tree_data)
# Convert data generated by ggtree into a dataframe
tree_data <- as.data.frame(p1$data)
p2 <- rotate_tree(p1, 90)

Coordinate system already present. Adding new coordinate system, which will
replace the existing one.

p2

# Review nodes
p3 <- p2 + geom_text(aes(label = node)) + geom_tiplab(offset = 1, size = 3, hjust = -0.1)
p3

8.6 Usage Example

# Please replace the following vectors with your specific values
nodes <- c(117, 129, 125, 127, 119, 123, 139, 166, 124, 131, 217) # x-values of the nodes you want to highlight
fill_colors <- c("#CD6600", "#CD6600", "#CD6600", "#CD6600", "#009933", "#009933", "#009933", "#009933", "#9B30FF", "#9B30FF", "#9B30FF") # Fill colors
alpha_values <- c(0.3, 0.3, 0.3, 0.3, 0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3) # Transparency values
extend_values <- c(25, 24, 24, 25, 25, 25, 24, 24, 25, 24, 24) # Values for the 'extend' parameter

# Now, you can call this function to create your ggtree layer

p2 <- highlight_by_node(
  p2, 
  nodes, 
  fill_colors, 
  alpha_values, 
  extend_values
  )

p2

8.7 Diff_deseq2 Enrichment Analysis

8.7.1 Obtain the list of genes

gene <- rownames(Diff_deseq2)
## Convert
gene = bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")

'select()' returned 1:1 mapping between keys and columns

## Remove duplicates and merge
gene <- dplyr::distinct(gene, SYMBOL, .keep_all=TRUE)
gene_df <- data.frame(logFC=Diff_deseq2$log2FoldChange,
                      SYMBOL = rownames(Diff_deseq2))
gene_df <- merge(gene_df, gene, by="SYMBOL")
GO_deseq2 <- gene_df$ENTREZID

8.7.2 GO Analysis for Biological Processes (BP)

# Perform gene enrichment analysis
erich.go.BP_deseq2 <- enrichGO(
  gene = GO_deseq2,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = 'BP', # Other categories: "CC", "MF" for molecular function
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE)
erich.go.BP.outdata_deseq2 <- as.data.frame(erich.go.BP_deseq2)
# Uncomment to write output to CSV
# write.csv(erich.go.BP.outdata_deseq2, "E:/fruit/erich.go.BP.outdata.csv")
head(erich.go.BP.outdata_deseq2, 5)

                   ID
GO:0016064 GO:0016064
GO:0019724 GO:0019724
GO:0002377 GO:0002377
GO:0002449 GO:0002449
GO:0002460 GO:0002460
                                                                                                                         Description
GO:0016064                                                                                   immunoglobulin mediated immune response
GO:0019724                                                                                                  B cell mediated immunity
GO:0002377                                                                                                 immunoglobulin production
GO:0002449                                                                                              lymphocyte mediated immunity
GO:0002460 adaptive immune response based on somatic recombination of immune receptors built from immunoglobulin superfamily domains
           GeneRatio   BgRatio       pvalue     p.adjust       qvalue
GO:0016064     14/80 205/18870 1.639824e-13 6.821238e-11 6.653066e-11
GO:0019724     14/80 208/18870 2.003300e-13 6.821238e-11 6.653066e-11
GO:0002377     12/80 196/18870 3.655475e-11 8.297927e-09 8.093349e-09
GO:0002449     14/80 368/18870 4.233820e-10 7.208078e-08 7.030369e-08
GO:0002460     14/80 380/18870 6.427551e-10 8.754324e-08 8.538494e-08
                                                                                                                                 geneID
GO:0016064 IGHV1-18/IGHV1-24/IGHV1-58/IGHV1-69/IGHV1-69-2/IGHV2-26/IGHV3-15/IGHV3-21/IGHV3-23/IGHV4-31/IGHV4-34/IGHV4-39/IGHV5-51/TREX1
GO:0019724 IGHV1-18/IGHV1-24/IGHV1-58/IGHV1-69/IGHV1-69-2/IGHV2-26/IGHV3-15/IGHV3-21/IGHV3-23/IGHV4-31/IGHV4-34/IGHV4-39/IGHV5-51/TREX1
GO:0002377                          IGKV1-16/IGKV4-1/IGLV1-40/IGLV2-23/IGLV3-10/IGLV3-19/IGLV3-21/IGLV3-25/IGLV4-60/IGLV4-69/TREX1/XBP1
GO:0002449 IGHV1-18/IGHV1-24/IGHV1-58/IGHV1-69/IGHV1-69-2/IGHV2-26/IGHV3-15/IGHV3-21/IGHV3-23/IGHV4-31/IGHV4-34/IGHV4-39/IGHV5-51/TREX1
GO:0002460 IGHV1-18/IGHV1-24/IGHV1-58/IGHV1-69/IGHV1-69-2/IGHV2-26/IGHV3-15/IGHV3-21/IGHV3-23/IGHV4-31/IGHV4-34/IGHV4-39/IGHV5-51/TREX1
           Count
GO:0016064    14
GO:0019724    14
GO:0002377    12
GO:0002449    14
GO:0002460    14

8.7.3 GO Analysis for Molecular Functions (MF)

# Perform gene enrichment analysis
erich.go.MF_deseq2 <- enrichGO(
  gene = GO_deseq2,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = 'MF', # Other categories: "CC", "MF" for molecular function
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE)

erich.go.MF.outdata_deseq2 <- as.data.frame(erich.go.MF_deseq2)
# Uncomment to write output to CSV
# write.csv(erich.go.MF.outdata_deseq2, "E:/fruit/erich.go.MF.outdata.csv")

head(erich.go.MF.outdata_deseq2, 5)

                   ID                              Description GeneRatio
GO:0003823 GO:0003823                          antigen binding     21/87
GO:0030280 GO:0030280 structural constituent of skin epidermis      4/87
GO:0042826 GO:0042826              histone deacetylase binding      5/87
             BgRatio       pvalue     p.adjust       qvalue
GO:0003823 178/18496 5.857929e-24 7.146673e-22 7.146673e-22
GO:0030280  36/18496 2.397665e-05 1.462576e-03 1.462576e-03
GO:0042826 127/18496 3.322487e-04 1.351145e-02 1.351145e-02
                                                                                                                                                                                                geneID
GO:0003823 IGHV1-18/IGHV1-24/IGHV1-58/IGHV1-69/IGHV1-69-2/IGHV2-26/IGHV3-15/IGHV3-21/IGHV3-23/IGHV4-31/IGHV4-34/IGHV4-39/IGHV5-51/IGKV1-16/IGKV4-1/IGLV1-40/IGLV2-23/IGLV3-19/IGLV3-21/IGLV3-25/TRBV28
GO:0030280                                                                                                                                                                   KRT2/KRT71/KRT85/KRTAP1-3
GO:0042826                                                                                                                                                         MAGEA1/MAGEA12/MAGEA3/MAGEA4/MAGEA6
           Count
GO:0003823    21
GO:0030280     4
GO:0042826     5

8.7.4 GO Analysis for Cellular Components (CC)

# Perform gene enrichment analysis
erich.go.CC_deseq2 <- enrichGO(
  gene = GO_deseq2,
  OrgDb = org.Hs.eg.db,
  keyType = "ENTREZID",
  ont = 'CC', # Other categories: "CC", "MF" for molecular function
  pvalueCutoff = 0.05,
  qvalueCutoff = 0.05,
  readable = TRUE)

erich.go.CC.outdata_deseq2 <- as.data.frame(erich.go.CC_deseq2)
# Uncomment to write output to CSV
# write.csv(erich.go.CC.outdata_deseq2, "E:/fruit/erich.go.CC.outdata.csv")

8.7.5 KEGG Analysis

kegg.out_deseq2 = enrichKEGG(
  gene = GO_deseq2,
  organism = "hsa",
  keyType = "kegg",
  pvalueCutoff = 0.05,
  pAdjustMethod = "BH",
  qvalueCutoff = 0.05)

Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...

kegg.out.outdata_deseq2 <- as.data.frame(kegg.out_deseq2)
# Uncomment to export the data, which are in ENTREZID format
# write.csv(kegg.out.outdata_deseq2, "E:/kegg.out.outdata.csv")

##### Convert numeric Entrez gene IDs or Ensembl gene IDs from above code to symbols
library(org.Hs.eg.db)
kegg.out1_deseq2 = as.data.frame(kegg.out_deseq2)
ENTREZID = strsplit(kegg.out1_deseq2$geneID, "/")
symbol = sapply(ENTREZID, function(x) {
  y = bitr(x, fromType = "ENTREZID", toType = "SYMBOL", OrgDb = "org.Hs.eg.db")
  # In case of multiple matches, take the first one
  y = y[!duplicated(y$ENTREZID), -1]
  y = paste(y, collapse = "/")
})
kegg.out1_deseq2$geneID = symbol
kegg.out1.outdata_deseq2 <- as.data.frame(kegg.out1_deseq2)
# Uncomment to export the converted data
# write.csv(kegg.out1.outdata_deseq2, "E:/fruit/kegg.out1.outdata.csv")
head(kegg.out.outdata_deseq2, 5)

 [1] category    subcategory ID          Description GeneRatio   BgRatio    
 [7] pvalue      p.adjust    qvalue      geneID      Count      
<0 rows> (or 0-length row.names)

head(kegg.out1.outdata_deseq2, 5)

 [1] category    subcategory ID          Description GeneRatio   BgRatio    
 [7] pvalue      p.adjust    qvalue      geneID      Count      
<0 rows> (or 0-length row.names)

8.7.6 DO Analysis

erich.go.DO_deseq2 = enrichDO(gene = GO_deseq2,
                              ont = "DO", # Other categories: "CC", "MF" for molecular function
                              pvalueCutoff = 0.05,
                              qvalueCutoff = 0.05,
                              readable = TRUE)

erich.go.DO.outdata_deseq2 <- as.data.frame(erich.go.DO_deseq2)
# Uncomment to export the data
# write.csv(erich.go.DO.outdata_deseq2, "E:/fruit/erich.go.DO.outdata.csv")
head(erich.go.DO.outdata_deseq2, 5)

                   ID           Description GeneRatio  BgRatio       pvalue
DOID:11132 DOID:11132 prostatic hypertrophy      3/34 34/10312 0.0001827545
             p.adjust     qvalue               geneID Count
DOID:11132 0.02668216 0.02481614 MAGEA1/MAGEA3/MAGEA4     3

8.7.7 Reactome Pathway Analysis

erich.go.Reactome_deseq2 <- enrichPathway(gene = GO_deseq2, pvalueCutoff = 0.05, readable = TRUE)

erich.go.Reactome.outdata_deseq2 <- as.data.frame(erich.go.Reactome_deseq2)
# Uncomment to export the data
# write.csv(erich.go.Reactome.outdata_deseq2, "E:/fruit/erich.go.Reactome.outdata.csv")
head(erich.go.Reactome.outdata_deseq2, 5)

                         ID                         Description GeneRatio
R-HSA-6805567 R-HSA-6805567                      Keratinization     18/43
R-HSA-6809371 R-HSA-6809371 Formation of the cornified envelope      6/43
                BgRatio       pvalue     p.adjust       qvalue
R-HSA-6805567 214/11009 3.032346e-20 2.850405e-18 2.649312e-18
R-HSA-6809371 129/11009 9.843111e-06 4.626262e-04 4.299885e-04
                                                                                                                                                        geneID
R-HSA-6805567 CDSN/KRT2/KRT25/KRT35/KRT71/KRT85/KRTAP1-3/KRTAP10-5/KRTAP11-1/KRTAP2-1/KRTAP2-2/KRTAP2-4/KRTAP26-1/KRTAP3-3/KRTAP4-2/KRTAP8-1/KRTAP9-3/KRTAP9-8
R-HSA-6809371                                                                                                                CDSN/KRT2/KRT25/KRT35/KRT71/KRT85
              Count
R-HSA-6805567    18
R-HSA-6809371     6

8.7.8 Filter and organize data by setting the count values of genes in pathways or by selecting pathway names of interest to the user.

# DEG_deseq2
## Conversion
GO_deseq2 = bitr(GO_deseq2, fromType="ENTREZID", toType="SYMBOL", OrgDb="org.Hs.eg.db")

'select()' returned 1:1 mapping between keys and columns

GO_deseq2 <- GO_deseq2$SYMBOL

# Usage Example
# count
count_threshold <- 12
result_threshold_deseq2 <- pathway_count(GO_deseq2, count_threshold, erich.go.BP.outdata_deseq2)
# Print the results
head(result_threshold_deseq2, 5)

           Symble                             Description Exists
1           ADIRF immunoglobulin mediated immune response      0
2 ATP6V1G2-DDX39B immunoglobulin mediated immune response      0
3           BANCR immunoglobulin mediated immune response      0
4           BOLA2 immunoglobulin mediated immune response      0
5         C1QTNF5 immunoglobulin mediated immune response      0

# List of selected pathway names
selected_pathways_names <- c("immunoglobulin production", "production of molecular mediator of immune response")
# Use function
result_names_deseq2 <- pathway_description(GO_deseq2, selected_pathways_names, erich.go.BP.outdata_deseq2)
# View the results
head(result_names_deseq2, 5)

           Symble               Description Exists
1           ADIRF immunoglobulin production      0
2 ATP6V1G2-DDX39B immunoglobulin production      0
3           BANCR immunoglobulin production      0
4           BOLA2 immunoglobulin production      0
5         C1QTNF5 immunoglobulin production      0

8.7.9 Label highly and lowly expressed genes and annotate corresponding colors

selected_genes_deseq2 <- result_threshold_deseq2 %>%
  dplyr::filter(Exists == 1) %>%
  dplyr::select(Symble) %>%
  dplyr::distinct()

# Invoke the function
result_deseq2 <- gene_color(selected_genes_deseq2, Diff_deseq2, "#0000EE", "#fc4746")

# Add gene highlights to the plot
add_gene_highlights_p3 <- highlight_genes(p2, result_deseq2, hilight_extend = 26)

add_gene_highlights_p3

8.7.10 heatmap

p4<- gheatmap(
  add_gene_highlights_p3+ geom_tiplab(offset=15,size=2.5),
  HeatdataDeseq2,
  width = 1, 
  colnames_offset_y = 0.5,
  font.size= 2, 
  hjust = 0)+ scale_fill_gradientn(colors = c(low = "#c2d5e5", high = "steelblue"))

Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

p4

8.7.11 ggtreeExtra::geom_fruit

## Shorten a name that was too long
result_threshold_deseq2$Description <- gsub("adaptive immune response based on somatic recombination of immune receptors built from immunoglobulin superfamily domains",
                                            "adaptive immune response", result_threshold_deseq2$Description)

# Enhance the visualization with additional scale and elements
p7 <- p4 + new_scale_fill() +
  geom_fruit(data = result_threshold_deseq2, geom = geom_tile,
              mapping = aes(y = Symble, x = Description, alpha = Exists, fill = Description),
              color = "grey50", pwidth = 0.5, offset = 1.02, size = 0.02) +
  scale_alpha_continuous(range = c(0.2, 0.8),
                          guide = guide_legend(keywidth = 0.3, keyheight = 0.3,  order = 2)) +
  scale_fill_manual(values = c("#CD6600", "#009933", "#0000EE", "#9B30FF", "#FF4040"), guide = guide_legend(keywidth = 0.3, keyheight = 0.3, order = 2))

p7

8.7.12 Bar plot section

# offset: Adjust the spacing between each bar
p8 <- p7 + new_scale_fill() + 
  geom_fruit(data = Diff_deseq2, geom = geom_bar, mapping = aes(y = ID, x = log_pvalue, fill = "log_pvalue"), pwidth = 0.3, offset = 0.1, orientation = "y", stat = "identity") +
  geom_fruit(data = Diff_deseq2, geom = geom_bar, mapping = aes(y = ID, x = log2FoldChange, fill = "log2FoldChange"), pwidth = 0.3, offset = 0.2, orientation = "y", stat = "identity") +
  geom_fruit(data = Diff_deseq2, geom = geom_bar, mapping = aes(y = ID, x = average, fill = "average"), pwidth = 0.3, offset = 0.04, orientation = "y", stat = "identity") +
  scale_fill_manual(values = c("log_pvalue" = "#87CEFA", "log2FoldChange" = "#FFC125", "average" = "#7B68EE"))

p8

9 Reference

G Yu, DK Smith, H Zhu, Y Guan, TTY Lam*. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution. 2017, 8(1):28-36.

G Yu, TTY Lam, H Zhu, Y Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution, 2018, 35(2):3041-3043.

S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L Zhan, T Wu, E Hu, Y Jiang, X Bo and G Yu*. ggtreeExtra: Compact visualization of richly annotated phylogenetic data. Molecular Biology and Evolution. 2021, 38(9):4039-4042.

T Wu#, E Hu#, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141.

G Yu, QY He*. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479.

G Yu, LG Wang, GR Yan, QY He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609.